library(tidyverse) 
library(NbClust)
library(factoextra)
library(cluster)
library(GGally)

Dataset Information

The dataset traveler_reviews.csv was gathered from destination reviews published by 249 reviewers of holidayiq.com till October 2014. Reviews falling in 6 categories among destinations across South India were considered and the count of reviews in each category for every reviewer (traveler) is captured.

The HolidayIQ Club

The HolidayIQ Club

Attribute Information:
- User Id: Unique user id
- Sports: Number of reviews on stadiums, sports complex, etc.
- Religious: Number of reviews on religious institutions
- Nature: Number of reviews on beach, lake, river, etc.
- Theatre: Number of reviews on theatres, exhibitions, etc.
- Shopping: Number of reviews on malls, shopping places, etc.
- Picnic: Number of reviews on parks, picnic spots, etc.

Mysore Palace in South India

Mysore Palace in South India

Alleppey Backwaters in South India

Alleppey Backwaters in South India

Data Preparation

d <- read_csv('traveler_reviews.csv')
glimpse(d)
## Observations: 249
## Variables: 7
## $ `User Id` <chr> "User 1", "User 2", "User 3", "User 4", "User 5", "U...
## $ Sports    <dbl> 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5...
## $ Religious <dbl> 77, 62, 50, 68, 98, 52, 64, 54, 64, 86, 107, 103, 64...
## $ Nature    <dbl> 79, 76, 97, 77, 54, 109, 85, 107, 108, 76, 54, 60, 8...
## $ Theatre   <dbl> 69, 76, 87, 95, 59, 93, 82, 92, 64, 74, 64, 63, 82, ...
## $ Shopping  <dbl> 68, 69, 50, 76, 95, 52, 73, 54, 54, 74, 103, 102, 75...
## $ Picnic    <dbl> 95, 68, 75, 61, 86, 76, 69, 76, 93, 103, 94, 93, 69,...
summary(d)
##    User Id              Sports        Religious         Nature     
##  Length:249         Min.   : 2.00   Min.   : 50.0   Min.   : 52.0  
##  Class :character   1st Qu.: 6.00   1st Qu.: 84.0   1st Qu.: 89.0  
##  Mode  :character   Median :12.00   Median :104.0   Median :119.0  
##                     Mean   :11.99   Mean   :109.8   Mean   :124.5  
##                     3rd Qu.:18.00   3rd Qu.:132.0   3rd Qu.:153.0  
##                     Max.   :25.00   Max.   :203.0   Max.   :318.0  
##     Theatre         Shopping         Picnic     
##  Min.   : 59.0   Min.   : 50.0   Min.   : 61.0  
##  1st Qu.: 93.0   1st Qu.: 79.0   1st Qu.: 92.0  
##  Median :113.0   Median :104.0   Median :119.0  
##  Mean   :116.4   Mean   :112.6   Mean   :120.4  
##  3rd Qu.:138.0   3rd Qu.:138.0   3rd Qu.:143.0  
##  Max.   :213.0   Max.   :233.0   Max.   :218.0
d_scaled <- d %>% select(-`User Id`) %>% scale()
head(d_scaled)
##         Sports  Religious     Nature    Theatre   Shopping     Picnic
## [1,] -1.509552 -1.0100142 -0.9973422 -1.4744331 -1.0740003 -0.7783943
## [2,] -1.509552 -1.4722052 -1.0630749 -1.2565864 -1.0499404 -1.6057691
## [3,] -1.509552 -1.8419580 -0.6029459 -0.9142560 -1.5070790 -1.3912645
## [4,] -1.509552 -1.2873288 -1.0411640 -0.6652884 -0.8815209 -1.8202736
## [5,] -1.509552 -0.3629468 -1.5451149 -1.7856426 -0.4243823 -1.0541859
## [6,] -1.358415 -1.7803325 -0.3400150 -0.7275303 -1.4589591 -1.3606210
d_scaled_tall <- d_scaled %>% as_tibble() %>% gather(key = "variable")
ggplot(data = d_scaled_tall, aes(x = variable, y = value)) +
  geom_boxplot()

fviz_nbclust(d_scaled, kmeans, method = "wss") +
    geom_vline(xintercept = 4, linetype = 2) +
  labs(subtitle = "Elbow method")

fviz_nbclust(d_scaled, kmeans, method = "silhouette") +
  labs(subtitle = "Silhouette method")

set.seed(123)
fviz_nbclust(d_scaled, kmeans, method = "gap_stat", nboot = 50)+
  labs(subtitle = "Gap statistic method")

K-means clustering

set.seed(123)
km_clusters <- kmeans(d_scaled, centers = 3, nstart = 25)
fviz_cluster(km_clusters, data = d_scaled)

km_clusters$size
## [1]  39 113  97
km_clusters$centers %>% t() %>% round(3)
##                1      2     3
## Sports     0.595 -0.917 0.829
## Religious  1.608 -0.659 0.121
## Nature    -0.832 -0.509 0.927
## Theatre   -0.183 -0.569 0.736
## Shopping   1.668 -0.616 0.047
## Picnic     0.562 -0.790 0.694
centroids_tall <- km_clusters$centers %>% 
  as_tibble() %>% 
  mutate(cluster = rownames(km_clusters$centers)) %>% 
  gather(key = var, value = value, Sports:Picnic)

ggplot(data = centroids_tall, 
       aes(x = cluster, y = value, fill = cluster)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~ var, ncol = 3)

sil <- silhouette(km_clusters$cluster, dist(d_scaled))
fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1   39          0.38
## 2       2  113          0.47
## 3       3   97          0.22

d_mutated <- d %>% 
  mutate(Total_reviews = Sports + Religious + Nature + Theatre + Shopping + Picnic,
         Sports = Sports/Total_reviews,
         Religious = Religious/Total_reviews,
         Nature = Nature/Total_reviews,
         Theatre = Theatre/Total_reviews,
         Shopping = Shopping/Total_reviews,
         Picnic = Picnic/Total_reviews)

rownames(d_mutated) <- d_mutated$`User Id`
d_mutated <- d_mutated %>% select(-`User Id`) %>% scale()
head(d_mutated)
##           Sports  Religious     Nature    Theatre   Shopping     Picnic
## User 1 -1.880073  0.3522515 -0.1201255 -0.4723631 -0.2730941  1.4201734
## User 2 -1.805406 -0.2400670  0.0874366  0.4113481  0.1551331 -0.3063525
## User 3 -1.822847 -1.2490676  0.9579022  1.0031802 -1.0004809  0.2060185
## User 4 -1.859398 -0.1372973 -0.1103104  1.2256926  0.2577910 -1.3799321
## User 5 -1.887305  1.7460634 -1.1879540 -1.0982110  1.0812211  0.5623982
## User 6 -1.510017 -1.3425196  1.1929387  1.0161042 -1.0702523 -0.1447961
##        Total_reviews
## User 1     -1.604465
## User 2     -1.893061
## User 3     -1.830662
## User 4     -1.690264
## User 5     -1.573265
## User 6     -1.643464
cor(d_mutated) %>% round(3)
##               Sports Religious Nature Theatre Shopping Picnic
## Sports         1.000    -0.183  0.097  -0.128   -0.035  0.029
## Religious     -0.183     1.000 -0.899  -0.427    0.774  0.002
## Nature         0.097    -0.899  1.000   0.178   -0.854  0.183
## Theatre       -0.128    -0.427  0.178   1.000   -0.389 -0.628
## Shopping      -0.035     0.774 -0.854  -0.389    1.000 -0.279
## Picnic         0.029     0.002  0.183  -0.628   -0.279  1.000
## Total_reviews  0.947    -0.024 -0.069  -0.223    0.120  0.070
##               Total_reviews
## Sports                0.947
## Religious            -0.024
## Nature               -0.069
## Theatre              -0.223
## Shopping              0.120
## Picnic                0.070
## Total_reviews         1.000
d_mutated <- as_tibble(d_mutated) %>% select(-Shopping)
cl_nb <- NbClust(data = d_mutated, distance = "euclidean",
                  min.nc = 2, max.nc = 9, 
                  method = "kmeans", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 3 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 4 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 4 proposed 8 as the best number of clusters 
## * 2 proposed 9 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
fviz_nbclust(cl_nb)
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 4 proposed  2 as the best number of clusters
## * 4 proposed  3 as the best number of clusters
## * 3 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 4 proposed  6 as the best number of clusters
## * 1 proposed  7 as the best number of clusters
## * 4 proposed  8 as the best number of clusters
## * 2 proposed  9 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

set.seed(127)
km_clusters2 <- kmeans(d_mutated, centers = 3, nstart = 25) 
fviz_cluster(km_clusters2, data = d_mutated)

km_clusters2$size
## [1] 86 78 85
km_clusters2$centers %>% t() %>% round(3)
##                    1      2      3
## Sports        -0.762  1.028 -0.172
## Religious     -0.434 -0.696  1.077
## Nature         0.394  0.668 -1.011
## Theatre        0.686 -0.050 -0.649
## Picnic        -0.464  0.210  0.277
## Total_reviews -0.854  0.906  0.033
centroids_tall2 <- km_clusters2$centers %>% 
  as_tibble() %>% 
  mutate(cluster = rownames(km_clusters2$centers)) %>% 
  gather(key = var, value = value, Sports:Total_reviews)
ggplot(data = centroids_tall2, 
       aes(x = cluster, y = value, fill = cluster)) +
  geom_bar(stat = "identity") + 
  facet_wrap(~ var, ncol = 3)

sil <- silhouette(km_clusters2$cluster, dist(d_mutated))
fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1   86          0.30
## 2       2   78          0.27
## 3       3   85          0.23

Hierarchical clustering

set.seed(127)
hclust_complete_sample <- eclust(sample(as.matrix(d_mutated), 40), FUNcluster="hclust", hc_method="complete", k = 3)
fviz_dend(hclust_complete_sample, show_labels=T, main = 'Dendrogram using complete linkage')

hclust_complete <- eclust(d_mutated, FUNcluster="hclust", hc_method="complete", k = 4)
fviz_dend(hclust_complete, show_labels=F, main = 'Dendrogram using complete linkage')

fviz_cluster(hclust_complete)

fviz_silhouette(hclust_complete)
##   cluster size ave.sil.width
## 1       1   77          0.17
## 2       2   56          0.19
## 3       3   52          0.31
## 4       4   64          0.26

cbind(d_mutated, cluster = hclust_complete$cluster) %>%
  as_tibble() %>%
  gather(key = var, value = value, Sports:Total_reviews) %>%
  group_by(cluster, var) %>%
  summarize(value = mean(value)) %>%
  ggplot(aes(x = cluster, y = value, fill = as.character(cluster))) +
  geom_bar(stat = "identity") + 
  facet_wrap(~ var, ncol = 3)

hclust_single <- eclust(d_mutated, FUNcluster="hclust", hc_method = "single", k=4) 
fviz_dend(hclust_single, show_labels=F, main = 'Dendrogram using single linkage')

fviz_cluster(hclust_single)

fviz_silhouette(hclust_single)
##   cluster size ave.sil.width
## 1       1  246         -0.18
## 2       2    1          0.00
## 3       3    1          0.00
## 4       4    1          0.00

hclust_average <- eclust(d_mutated, FUNcluster="hclust", hc_method ="average",k=4)
fviz_dend(hclust_average, show_labels=F, main = 'Dendrogram using average linkage')

fviz_cluster(hclust_average)

fviz_silhouette(hclust_average)
##   cluster size ave.sil.width
## 1       1   53          0.27
## 2       2  102          0.17
## 3       3   52          0.26
## 4       4   42          0.34